In [1]:
# Pull in the shared notebook environment. The star import is presumably what
# provides DATA_PATH, pd, os, display and HTML used below -- TODO confirm in setup.
from setup import *
import sys
# Put DATA_PATH on the import path (only once), presumably so that the
# `constants` module imported next can be found there -- TODO confirm.
if DATA_PATH not in sys.path: sys.path.append(DATA_PATH)
# NOTE(review): later cells use a name `url` that is not defined in this
# notebook; it presumably comes from this star import -- confirm.
from constants import *
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Inject CSS so the notebook container spans the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))
# Limit how much of a DataFrame is rendered: 12 rows, 200 columns.
pd.set_option('display.max_rows', 12)
pd.set_option('display.max_columns', 200)
In [2]:
# Read the gzipped tweet dump from DATA_PATH into `df`.
tweets_csv_path = os.path.join(DATA_PATH, 'deduped_tweets.csv.gz')
df = pd.read_csv(tweets_csv_path, low_memory=False)
In [3]:
# Keep a single row per `id` (the last occurrence) and only the three columns
# that are inspected below.
df = df.drop_duplicates('id', keep='last')[['id', 'id_str', 'text']]
# Sanity check: number of rows where `id` and `id_str` disagree.
# The count is the cell's displayed output.
(df.id != df.id_str).sum()
Out[3]:
In [4]:
# Narrow the frame to the id and the raw tweet text. The explicit copy makes
# `df` an independent frame, so the column assignments in later cells
# (df['tokens'], df['txt']) do not raise SettingWithCopyWarning.
df = df[['id', 'text']].copy()
In [5]:
# Display the raw tweet text column.
df['text']
Out[5]:
In [6]:
# First-pass tokenization: split every tweet on runs of whitespace and keep
# the resulting list of strings in a new `tokens` column.
df['tokens'] = df['text'].str.split()
# Show the frame with the new column.
df
Out[6]:
In [7]:
# Second pass: delete URLs before splitting on whitespace.
# `url` is not defined in this notebook; it is presumably a URL regex from the
# star imports in the first cell -- TODO confirm. regex=True is passed
# explicitly because pandas >= 2.0 treats the pattern as a literal string by
# default (and rejects a compiled pattern unless regex=True).
df['tokens'] = df.text.str.replace(url, '', regex=True).str.split()
# Show the frame with the updated `tokens` column.
df
Out[7]:
In [8]:
# Build a cleaned-up `txt` column from the raw text:
#   1. replace URLs with a space (`url` presumably comes from the star imports
#      in the first cell -- TODO confirm),
#   2. replace every run of non-word characters with a single space,
#   3. collapse runs of whitespace to a single space.
# regex=True is required on each call: pandas >= 2.0 defaults to literal
# matching, which would leave r'\W+' and r'\s+' unmatched.
df['txt'] = (
    df.text
    .str.replace(url, ' ', regex=True)
    .str.replace(r'\W+', ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
)
# Display the cleaned text.
df.txt
Out[8]:
In [9]:
# Drop digit runs from the cleaned text, then collapse the extra spaces that
# the replacement leaves behind. regex=True is passed explicitly because
# pandas >= 2.0 would otherwise treat r'\d+' and r'\s+' as literal strings.
df['txt'] = (
    df.txt
    .str.replace(r'\d+', ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
)
# Re-tokenize from the cleaned text.
df['tokens'] = df.txt.str.split()
# Show the frame with the updated `txt` and `tokens` columns.
df
Out[9]:
Notice that we trounced the hashtag #Python
That's not good.
Can you fix it?
Anything else we might be messing up?
What other punctuation marks have special meaning in Tweets?
In [10]:
# improve on the "stopword" filters here
#
# :-) (ask me about a smiley lexicon)
# not-so-simple words? (ask me about a regex for compound words)
# Python variable names with underscores? (regex)
In [11]:
# Use the standard-library csv module for the quoting constant rather than
# reaching into pandas' private `pd.io.common` namespace, which only exposed
# `csv` as an incidental import.
import csv

# Write the processed frame to a gzipped CSV under DATA_PATH.
# QUOTE_NONNUMERIC quotes every non-numeric field. List-valued columns such as
# `tokens` are written as their string representation.
f = os.path.join(DATA_PATH, 'text.csv.gz')
df.to_csv(f, encoding='utf8', compression='gzip', quoting=csv.QUOTE_NONNUMERIC)
In [12]:
import gzip

# Read the file back through an explicit gzip handle. The read must be
# indented inside the `with` block so it runs while the handle is open.
with gzip.open(os.path.join(DATA_PATH, 'text.csv.gz'), 'rb') as f:
    df = pd.read_csv(f)
Make sure you can read it back in!
In [13]:
# Read the saved file back in. `DataFrame.from_csv` was removed in pandas 1.0;
# `read_csv(..., index_col=0)` is the replacement and restores the first CSV
# column (the index written by `to_csv`) as the index. Compression is
# inferred from the `.gz` extension.
df = pd.read_csv(os.path.join(DATA_PATH, 'text.csv.gz'), index_col=0)
# Display the round-tripped frame.
df
Out[13]:
In [ ]: